## Assemble the colour palettes used by all downstream plots.
## fabcolors: 11-colour diverging RdGy palette.
fabcolors <- RColorBrewer::brewer.pal(n = 11, name = 'RdGy')
## Six additional 10-colour diverging brewer palettes.
col1 <- RColorBrewer::brewer.pal(n = 10, name = 'PRGn')
col2 <- RColorBrewer::brewer.pal(n = 10, name = 'Spectral')
col3 <- RColorBrewer::brewer.pal(n = 10, name = 'BrBG')
col4 <- RColorBrewer::brewer.pal(n = 10, name = 'PiYG')
col5 <- RColorBrewer::brewer.pal(n = 10, name = 'PuOr')
col6 <- RColorBrewer::brewer.pal(n = 10, name = 'RdBu')

## One flat vector of all brewer colours, wrapped in a single-element list.
allcolors <- list(c(fabcolors, col1, col2, col3, col4, col5, col6))

## Two discrete Wes Anderson palettes, each wrapped in a list as well.
morecolors1 <- list(wes_palette("Darjeeling1", n = 4, type = "discrete"))
morecolors2 <- list(wes_palette("Moonrise2", n = 3, type = "discrete"))

## color_list: [[1]] brewer colours, [[2]] Darjeeling1, [[3]] Moonrise2.
color_list <- c(allcolors, morecolors1, morecolors2)

0.1 Introduction

This document describes training a random forest model using latent variables (LVs) generated by transfer learning approaches as its input features. We are using the latent variables found in NF1 tumor data to train the forest to find the most important classifying LVs for the four tumor types in NF1. We chose a random forest classifier to identify NF tumor types using the LVs because of the following properties of the model:

  • robustness to high dimensionality data
  • ability to handle unbalanced classes
  • robustness to outliers and non-linear data
  • quick training /prediction speeds
  • low bias and moderate variance

Our goal is to find important latent variables that classify the various tumorTypes. We will then inspect the classifying features to find meaningful genesets that distinguish between two tumortypes.

Tumortypes represented in the data:

  • Plexiform Neurofibroma
  • MPNST
  • Cutaneous Neurofibroma
  • Neurofibroma
## Load initial Fit (the trained caret model object) from Synapse
load(synGet("syn21317737")$path)

## Load Fit data (All the ensemble models described in this document are stored on Synapse)
## NOTE(review): syn21317654 presumably supplies forest_data/FeatureList/perf objects
## used below -- confirm against the Synapse entity
load(synGet("syn21317654")$path)

0.2 Partitioning the data into training and testing set:

We first partition the data into a holdout set and a model set. The model set (newforest) will be further partitioned into training and test sets by iterative random sampling, which will be used to train and test the ensemble models during their iterations. The holdout set will be used to test the ensemble models at the very end. The holdout test set will be a naive set that will not be seen by any of the models until the very end.

#Make the model (70%) and holdout (30%) datasets, stratified by tumorType
set.seed(998)  #(keep this seed fixed to reproduce the exact same model/holdout split)
inTraining <- createDataPartition(as.factor(forest_data$tumorType), p = .70, list = FALSE)

# newforest is the model-building set; holdout is never seen until final evaluation
newforest <- forest_data[ inTraining,]
holdout  <- forest_data[-inTraining,]

0.2.1 Initial model to find optimum mtry and ntree

The newforest dataset was split into 75% training and 25% testing dataset. The function createDataPartition is used to create balanced splits of the data. Since the tumorType argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data.

#Make the test and training datasets from the model set
set.seed(998)  #(keep this seed fixed to reproduce the exact same train/test split)
# 75/25 split of newforest, stratified by tumorType
inTraining <- createDataPartition(as.factor(newforest$tumorType), p = .75, list = FALSE)

# BUG FIX: inTraining holds row indices of `newforest`, not `forest_data`.
# Subsetting forest_data here misaligned the split and could leak holdout
# samples into the training set.
training <- newforest[ inTraining,]
testing  <- newforest[-inTraining,]

0.2.1.1 Model training and Crossvalidation :

We first trained an initial model on the training dataset, iterating over mtry values (1:100) and ntree values (250, 500, 1000, 2000). The model was cross-validated using a 5-fold cross-validation technique repeated 5 times. The details of the model are given below:

# Cross-validation control: 5-fold CV, repeated 5 times
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated five times
                           repeats = 5)

# sqrt(p) is the conventional default mtry for classification forests;
# computed here for reference only -- the grid below searches mtry = 1:100
n <- sqrt(ncol(forest_data))
tunegrid <- expand.grid(.mtry = c(1:100))

#Find the per-class sample counts in the training set:
summary(training$tumorType)
##                  Cutaneous Neurofibroma 
##                                      10 
## Malignant Peripheral Nerve Sheath Tumor 
##                                       8 
##                            Neurofibroma 
##                                      11 
##                  Plexiform Neurofibroma 
##                                      15
## Construct the initial random forest model called Fit (the code is commented out to facilitate quick rendering of html file by loading the Fit from Synapse)

# #start parallelization
# cl <- makePSOCKcluster(10)
# registerDoParallel(cl)
# 
# set.seed(9998)
# Fit <- train(tumorType ~ .,
#              data = training[,c(1:ncol(training))],
#              method = "rf",
#              ntree= 1000,
#              tuneGrid = tunegrid,
#              proximity=TRUE,
#              importance = TRUE,
#              trControl = fitControl,
#              verbose = TRUE)
# 
# 
# ## When you are done:
# stopCluster(cl)

# Display the cross-validation results of the initial model loaded from Synapse
print(" Check the fit of the initial model")                 
## [1] " Check the fit of the initial model"
Fit
## Random Forest 
## 
##  44 samples
## 962 predictors
##   4 classes: 'Cutaneous Neurofibroma', 'Malignant Peripheral Nerve Sheath Tumor', 'Neurofibroma', 'Plexiform Neurofibroma' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 34, 35, 36, 35, 36, 36, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##     1   0.7127778  0.6046369
##     2   0.7117778  0.6023586
##     3   0.7127778  0.6041557
##     4   0.7117778  0.6015444
##     5   0.7122222  0.6030109
##     6   0.7212222  0.6160123
##     7   0.7083333  0.5974228
##     8   0.7172222  0.6105570
##     9   0.7167778  0.6091403
##    10   0.7167778  0.6093437
##    11   0.7212222  0.6164057
##    12   0.7127778  0.6037349
##    13   0.7172222  0.6106069
##    14   0.7117778  0.6015444
##    15   0.7127778  0.6034299
##    16   0.7022222  0.5876858
##    17   0.7122222  0.6030109
##    18   0.7077778  0.5961390
##    19   0.7212222  0.6160123
##    20   0.7172222  0.6103018
##    21   0.7167778  0.6095612
##    22   0.7167778  0.6091403
##    23   0.7172222  0.6111970
##    24   0.7212222  0.6164057
##    25   0.7167778  0.6100423
##    26   0.7122222  0.6035194
##    27   0.7167778  0.6091403
##    28   0.7127778  0.6037349
##    29   0.7127778  0.6037349
##    30   0.7127778  0.6034299
##    31   0.7212222  0.6160123
##    32   0.7127778  0.6038507
##    33   0.7037778  0.5905834
##    34   0.7122222  0.6032310
##    35   0.7122222  0.6032063
##    36   0.7162222  0.6092019
##    37   0.7167778  0.6088353
##    38   0.7117778  0.6017398
##    39   0.7072222  0.5964246
##    40   0.7122222  0.6032063
##    41   0.7167778  0.6088353
##    42   0.7167778  0.6091403
##    43   0.7072222  0.5955253
##    44   0.7077778  0.5961390
##    45   0.7117778  0.6021579
##    46   0.7162222  0.6086364
##    47   0.7077778  0.5961390
##    48   0.7077778  0.5958339
##    49   0.7122222  0.6036244
##    50   0.7127778  0.6042034
##    51   0.7217778  0.6156438
##    52   0.7127778  0.6036731
##    53   0.7033333  0.5900406
##    54   0.7172222  0.6103018
##    55   0.7127778  0.6037349
##    56   0.7127778  0.6038507
##    57   0.7122222  0.6034043
##    58   0.7127778  0.6037349
##    59   0.7172222  0.6106069
##    60   0.7077778  0.5963344
##    61   0.7122222  0.6032063
##    62   0.7027778  0.5887631
##    63   0.7077778  0.5961842
##    64   0.7127778  0.6037349
##    65   0.7077778  0.5960540
##    66   0.7027778  0.5881897
##    67   0.7067778  0.5943472
##    68   0.7212222  0.6160123
##    69   0.7127778  0.6034299
##    70   0.7077778  0.5963344
##    71   0.7172222  0.6103018
##    72   0.7077778  0.5963591
##    73   0.7112222  0.6012358
##    74   0.7117778  0.6017398
##    75   0.7087778  0.5986879
##    76   0.7023333  0.5884523
##    77   0.7027778  0.5881711
##    78   0.6983333  0.5817251
##    79   0.7037778  0.5911969
##    80   0.7122222  0.6032143
##    81   0.7127778  0.6034299
##    82   0.7132222  0.6051397
##    83   0.7117778  0.6019445
##    84   0.7067778  0.5950978
##    85   0.7132222  0.6052480
##    86   0.7127778  0.6034299
##    87   0.7127778  0.6039317
##    88   0.7027778  0.5886534
##    89   0.7072222  0.5955253
##    90   0.7032222  0.5899698
##    91   0.7027778  0.5886534
##    92   0.7032222  0.5900795
##    93   0.7067778  0.5951782
##    94   0.7027778  0.5886534
##    95   0.7117778  0.6015444
##    96   0.7027778  0.5890469
##    97   0.7127778  0.6034299
##    98   0.7067778  0.5953629
##    99   0.7073333  0.5951409
##   100   0.7122222  0.6034043
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 51.
# Select final number of ntrees
# accuracy <- vector()
# besttune <- vector()
# iter <- c(250, 500, 1000, 2000)
# accuracy[4] <- max(Fit$results$Accuracy)
# besttune[4] <- Fit$bestTune$mtry
# 
# ROC <- data.frame(iter=numeric(),
#                    accuracy=numeric())
# ROC <- as.data.frame(cbind(iter,accuracy, besttune))
# 
# # ROC curve:
# theme_update(text = element_text(size=15))
# ggplot(ROC,  aes(x=iter, y=accuracy)) +
#   geom_point(aes(size=1)) +
#   geom_line() +
#   coord_cartesian(ylim = c(0,1)) +
#   labs(main="The model", x="ntrees :: Number of trees in the forest", y= "Accuracy of the model") 

#Highest accuracy <- ntrees = 1000

#plot accuracy as a function of mtry for the initial model
theme_update(text = element_text(size=15))
ggplot(Fit$results,  aes(x=mtry, y=Accuracy)) +
  geom_point(aes(size=1)) +
  geom_line() +
  coord_cartesian(ylim = c(0,1)) +
  # BUG FIX: labs() has no `main` argument, so the plot title was silently
  # dropped; `title` is the correct argument name
  labs(title="The model", x="mtry :: Number of features for each split", y= "Accuracy of the model") 

print("Check the clustering of the samples according to the model")
## [1] "Check the clustering of the samples according to the model"
# Multidimensional scaling of the random-forest proximity matrix (k = 2
# dimensions); point symbols (pch) encode tumorType
MDSplot(Fit$finalModel, 
        as.factor(training$tumorType), 
        k=2, palette=NULL, 
        pch=as.numeric(training$tumorType), 
        cex=1, 
        cex.axis= 1.1,
        cex.lab = 1.1,
        cex.main = 1.1,
        main= "MDS Plot of the initial training set")
# NOTE(review): the legend takes 6 colours from Set1 while MDSplot (with
# palette=NULL) colours the 4 classes with its own default palette --
# confirm the legend fills actually match the plotted points
legend("topright",
       inset=0.01, 
       cex= 1.1,
       legend=levels(training$tumorType), 
       fill=brewer.pal(6, "Set1"))

#Order errors using OOB
#head(Fit$finalModel$err.rate[order(Fit$finalModel$err.rate[,1]),])

# Predict tumorType labels for the held-out test samples; the column index
# below selects every column, i.e. the entire test frame.
pred <- predict(Fit, newdata = testing[, seq_len(length(colnames(testing)))])

# Attach the predicted labels so the confusion matrix can compare them
# against the true tumorType column.
testing$pred <- pred
# Check the accuracy of the model on the test set (precision/recall mode)
library(DT)

conf_matrix <- confusionMatrix(data = testing$pred, 
                              reference = as.factor(testing$tumorType), 
                              mode = "prec_recall")

## Make a performance histogram from initial iterations of the forest

# Keep only the per-class F1 scores; `perf` seeds the per-iteration
# performance tables (perf_new / perf_final) used by the ensembles below
perf <- as.data.frame(conf_matrix$byClass)
perf$Class <- rownames(perf)
perf <- perf %>%
  dplyr::select(Class, F1)

# estimate variable importance (caret scales scores to 0-100)
importance <- varImp(Fit, scale=TRUE)

# Per-class importance score for every latent variable
list_init <- as.data.frame(importance$importance)


# Plot the 80 most important variables ranked by mean decrease in Gini
# (type = 2 selects the Gini measure)
varImpPlot(Fit$finalModel,
           main = "Important variables in the forest",
           n.var = 80,
           type = 2)

# DT:: datatable(list_init)

#conf_matrix$table

0.2.2 Iterating over models

We observed that tuning our model did not significantly improve the performance of the initial model. As a result we tried an ensemble approach where we ran 500 randomized iterations of our forest with the optimum mtrys we found from our initial model. In each iteration, a new randomly sampled training set and testing set was generated to train and test an independent model. We then plotted the performance of all the 500 independent models to get a distribution of F1 scores for each class. A higher median F1 score for a class overall would mean a higher classification accuracy for that class.

# FeatureList <- list()
# perf_new <- perf

# Load the Featurelist and perf list stored on synapse
#load(synGet("syn21267354")$path)

#The model building code has been commented out below for quick rendering of html

# for (i in 1:500){
#   # make new train-test set
#   inTraining <- createDataPartition(as.factor(newforest$tumorType), p = .75, list = FALSE)
#   training <- newforest[ inTraining,]   # (fixed: these indices refer to newforest, not forest_data)
#   testing  <- newforest[-inTraining,]
#   
#   tunegrid <- expand.grid(.mtry = 51)
# 
#   #start parallelization
#   cl <- makePSOCKcluster(10)
#   registerDoParallel(cl)
#   
#   #make new model
#   Fit_new <- train(tumorType ~ .,
#              data = training[,c(1:ncol(training))],
#              method = "rf",
#              ntree= 1000,
#              #mtry = 51,            #using mtry=51 as seen as best tune for initial Fit
#              tuneGrid = tunegrid,
#              #classwt =
#              proximity=TRUE,
#              importance = TRUE,
#              trControl = fitControl,
#              verbose = TRUE)
#   
#   # When you are done:
#   stopCluster(cl)
# 
#   # predict test set with the model to get F1 scores
#   pred <- predict(Fit_new, newdata = testing[,c(1:length(colnames(testing)))])
#   #store predicted labels in the test dataframe
#   testing$pred <- pred
# 
#   #Make confusion matrix
#   conf_matrix <- confusionMatrix(data = testing$pred,
#                               reference = as.factor(testing$tumorType),
#                               mode = "prec_recall")
# 
#   ## Store F1 scores from various iterations of the forest
#   df <- as.data.frame(conf_matrix$byClass)
#   perf_new[, glue('Iter{i}')] <- df$F1
#   #perf[, Class] <- rownames(df)
# 
# 
#   #Store Feature importance for all iterations in a list
#   # estimate variable importance
#   importance <- varImp(Fit_new, scale=TRUE)
# 
#   # Select top important features
#   features <- as.data.frame(importance$importance)
# 
#   FeatureList[[i]] <- (features)
# }


# Plot histogram of all F1 scores
# Reshape perf_new to long format: one row per (Class, iteration) F1 score
perf_new$Class <- as.factor(perf_new$Class)
perf_new_long <- gather(perf_new, iteration, All_scores, F1:Iter500, factor_key=TRUE)

# NOTE(review): par(mfrow=...) only affects base graphics, not ggplot2 --
# the two plots below still render as separate figures
par(mfrow=c(2,1)) 

theme_update(legend.text = element_text(size=8), 
      axis.text.x  = element_text(size=10),
      axis.text.y = element_text(size=10),
      text = element_text(size=10))

# Histogram of per-class F1 scores across the 500 forest iterations
ggplot(perf_new_long, aes(x=All_scores, fill=Class, color= Class)) + 
  geom_histogram( binwidth=0.05, alpha=0.5, position="dodge") +
  theme(legend.position="top") +
  scale_color_brewer(palette="Spectral")+
  scale_fill_brewer(palette="Spectral") +
  labs(title="Histogram of raw F1 scores for iterations of RF",x="F1 scores from different iterations of RF", y = "Number of RF iterations with a given F1 score") +
  xlim(0, 1)

# Same distributions shown as smoothed density curves
ggplot(perf_new_long, aes(x=All_scores, fill=Class, color=Class)) + 
  #geom_histogram( binwidth=0.05, alpha=0.5, position="dodge") +
  geom_density(alpha=0.5) +
  theme(legend.position="top") +
  #scale_color_manual(values=allcolors[[1]][12:18])
  scale_color_brewer(palette="Spectral")+
  scale_fill_brewer(palette="Spectral") +
  labs(title="Density plot of F1 scores for iterations of RF",x="F1 scores from different iterations of RF", y = "Proportions of RF iterations with a given F1 score") +
  xlim (0,1)

# print("The median F1 scores for first ensemble")
# table(rowMedians(as.matrix(perf_new[,c(2:ncol(perf_new))])))

0.2.2.1 Top 40 important variables for the first ensemble model

Lets take a look at the important latent variables picked up by our models as top classifiers for the different classes.

# Collect per-iteration importance scores for each tumour class.
# Column k of FeatureList[[i]] holds class k's scores for iteration i, so
# extracting column k across all iterations yields a features x iterations
# dataframe (columns V1..V500).
extract_class_scores <- function(k) {
  n_feat <- nrow(FeatureList[[1]])
  as.data.frame(vapply(FeatureList, function(f) f[[k]], numeric(n_feat)))
}
features_cNF   <- extract_class_scores(1)
features_MPNST <- extract_class_scores(2)
features_NF    <- extract_class_scores(3)
features_pNF   <- extract_class_scores(4)

# # Plot the drop of importance in features in cNF
# data <- features_cNF[order(-features_cNF$V1),]
# data$id <- rownames(data)
# data <- data %>% mutate(index = row_number())
# data <- data[,c("V1","id", "index")]
# ggplot(data = data,
#        aes(x=index, y=V1)) +
#   geom_point()

## From this plot the importance seems to stagnate


# Plot the distribution of importance scores of features.
# For one tumour class: compute each feature's median importance across all
# forest iterations, keep the top 40 features, save them to the global
# environment as `ordered_<class>`, and return a ridgeline plot of their
# per-iteration score distributions.
ridgeplot_classes <- function(featurelist, dataframe, class){
  #Take row median importance across iterations
  dataframe$median <- rowMedians(as.matrix(dataframe[,]))
  #add feature-name column
  # BUG FIX: use the `featurelist` argument rather than the global
  # FeatureList, so the function honours what the caller passes in
  dataframe$Celltype <- rownames(featurelist[[1]])
  #Take the top 40 rows by median importance
  dataframe <- dataframe[order(-dataframe$median),]
  new_df <- dataframe[(1:40),]
  #make long df: one row per (feature, iteration)
  dataframe_long <- gather(new_df, iteration, All_scores, V1:V500, factor_key=TRUE)
  
  #save the ordered df into the global environment for downstream chunks
  var_name <- glue('ordered_{class}') # Construct the name
  # `envir` is the actual argument name of assign(); the previous `env=`
  # only worked through partial argument matching
  assign(var_name, new_df, envir = .GlobalEnv)
 
  #make ridgeplot
  theme_update(legend.text = element_text(size=8), 
      axis.text.x  = element_text(size=15),
      axis.text.y = element_text(size=10),
      text = element_text(size=15))
  # NOTE(review): the aes maps `fill`, so scale_color_manual() does not
  # recolour the ridges -- confirm whether scale_fill_manual() was intended
  ggplot(dataframe_long, aes(x=All_scores, y=Celltype, fill=Celltype)) + 
  geom_density_ridges(scale = 3, rel_min_height = 0.01, alpha= 0.5)  +
  theme(legend.position="none") +
  scale_color_manual(values=color_list[[1]]) +
  labs(title= glue('{class}: Top 40 Features'),x="Importance Score", y = "Proportion of RFs") +
  xlim(-10,100)
  
}

# Ridgeline plots of the top-40 feature importance distributions per class;
# each call also saves a global `ordered_<class>` dataframe used later
ridgeplot_classes(FeatureList, features_cNF, "Cutaneous Neurofibroma")
## Picking joint bandwidth of 2.26

ridgeplot_classes(FeatureList, features_MPNST, "MPNST")
## Picking joint bandwidth of 2.62

ridgeplot_classes(FeatureList, features_NF, "Neurofibroma")
## Picking joint bandwidth of 2.43

ridgeplot_classes(FeatureList, features_pNF, "Plexiform Neurofibroma")
## Picking joint bandwidth of 2.5

0.2.3 Testing sufficiency of top features as predictors of classes

Since the mean decrease in Gini index flattens out after the top 40 features, we selected the top 40 features from all the classes and took their union to train a second ensemble of random forests.

# Take all top features for all classes; caret wraps non-syntactic feature
# names in backticks, which are stripped everywhere below.
strip_backticks <- function(x) gsub("`", "", x)

`ordered_Cutaneous Neurofibroma`$Celltype <- strip_backticks(`ordered_Cutaneous Neurofibroma`$Celltype)
ordered_MPNST$Celltype <- strip_backticks(ordered_MPNST$Celltype)
ordered_Neurofibroma$Celltype <- strip_backticks(ordered_Neurofibroma$Celltype)
`ordered_Plexiform Neurofibroma`$Celltype <- strip_backticks(`ordered_Plexiform Neurofibroma`$Celltype)

# Union (deduplicated) of the four per-class top-feature sets
allfeatures <- unique(c(`ordered_Cutaneous Neurofibroma`$Celltype,
                        ordered_MPNST$Celltype,
                        ordered_Neurofibroma$Celltype,
                        `ordered_Plexiform Neurofibroma`$Celltype))
allfeatures <- strip_backticks(allfeatures)

# Features shared pairwise, then across all four classes
commonfeatures1 <- intersect(`ordered_Cutaneous Neurofibroma`$Celltype, ordered_MPNST$Celltype)
commonfeatures2 <- intersect(ordered_Neurofibroma$Celltype, `ordered_Plexiform Neurofibroma`$Celltype)
commonfeatures <- intersect(commonfeatures1, commonfeatures2)


# Full concatenation (duplicates retained) of all per-class top features
full_feature_list <- c(`ordered_Cutaneous Neurofibroma`$Celltype,
                       ordered_MPNST$Celltype,
                       ordered_Neurofibroma$Celltype,
                       `ordered_Plexiform Neurofibroma`$Celltype)
full_feature_list <- strip_backticks(full_feature_list)

To build the ensemble of forests, only the selected features were provided as input to the models; the data was randomly sampled at each iteration of model building to generate a new training set, and the initial holdout set (never used in any of the previous model-building iterations) was used as the test set to evaluate the performance of each of the 500 models.

# FeatureList_final <- list()
# perf_final <- perf

## Restrict the model set (newforest) and the holdout set to the selected
## top features, then re-attach the tumorType label column.
final_model_data <- newforest[, colnames(newforest) %in% allfeatures]
final_model_data$tumorType <- newforest$tumorType

final_holdout <- holdout[, colnames(holdout) %in% allfeatures]
final_holdout$tumorType <- holdout$tumorType

#The model building code has been commented out below for quick rendering of html

 # for (i in 1:500){
 #  # make new train-test set
 #  inTraining <- createDataPartition(as.factor(final_model_data$tumorType), p = .75, list = FALSE)
 #  training <- final_model_data[ inTraining,]
 #  testing  <- final_holdout #final_model_data[-inTraining,]  #use the holdout test data that none of the potential models have seen before
 # 
 #  #start parallelization
 #  cl <- makePSOCKcluster(10)
 #  registerDoParallel(cl)
 # 
 #  #make new model
 #  Fit_new <- train(tumorType ~ .,
 #             data = training[,c(1:ncol(training))],
 #             method = "rf",
 #             ntree= 1000,
 #             #mtry = 45,
 #             tuneGrid = tunegrid,
 #             #classwt =
 #             proximity=TRUE,
 #             importance = TRUE,
 #             trControl = fitControl,
 #             verbose = TRUE)
 # 
 #  # When you are done:
 #  stopCluster(cl)
 # 
 #  # predict test set with the model to get F1 scores
 #  pred <- predict(Fit_new, newdata = testing[,c(1:length(colnames(testing)))])
 #  #store predicted labels in the test dataframe
 #  testing$pred <- pred
 # 
 #  #Make confusion matrix
 #  conf_matrix <- confusionMatrix(data = testing$pred,
 #                              reference = as.factor(testing$tumorType),
 #                              mode = "prec_recall")
 # 
 #  ## Store F1 scores from various iterations of the forest
 #  df <- as.data.frame(conf_matrix$byClass)
 #  perf_final[, glue('Iter{i}')] <- df$F1
 #  #perf[, Class] <- rownames(df)
 # 
 # 
 #  #Store Feature importance for all iterations in a list
 #  # estimate variable importance
 #  importance <- varImp(Fit_new, scale=TRUE)
 # 
 #  # Select top important features
 #  features <- as.data.frame(importance$importance)
 # 
 #  FeatureList_final[[i]] <- (features)
 # }

# NOTE(review): `df` is assigned here but not referenced again in this
# document -- possibly a leftover from debugging
df <- perf_final

# Plot histogram of all F1 scores
# Reshape perf_final to long format: one row per (Class, iteration) F1 score
perf_final_long <- gather(perf_final, iteration, All_scores, 3:ncol(perf_final), factor_key=TRUE)

# NOTE(review): par() affects only base graphics, not the ggplot below
par(mfrow=c(2,1)) 

theme_update(legend.text = element_text(size=8), 
      axis.text.x  = element_text(size=10),
      axis.text.y = element_text(size=10),
      text = element_text(size=10))

# Density of per-class F1 scores for the restricted-feature ensemble,
# evaluated against the naive holdout set
ggplot(perf_final_long, aes(x=All_scores, fill=Class, color=Class)) + 
  #geom_histogram( binwidth=0.05, alpha=0.5, position="dodge") +
  geom_density(alpha=0.5) +
  theme(legend.position="top") +
  #scale_color_manual(values=allcolors[[1]][12:18])
  scale_color_brewer(palette="Spectral")+
  scale_fill_brewer(palette="Spectral") +
  labs(title="Density plot of F1 scores for iterations of RF",x="F1 scores from different iterations of RF", y = "Proportions of RF iterations with a given F1 score") +
  xlim (0,1)

We found that the new ensemble of 500 forests fared better with the restricted feature-set with most of the classes showing an improvement in their median F1 scores. This suggested to us that the top 40 features from each class were sufficient for efficient classification of the tumortypes.

# Per-class importance frames (features x 500 iterations) for the final
# restricted-feature ensemble; column k of each iteration's importance
# dataframe belongs to class k.
class_scores_final <- lapply(1:4, function(k) {
  as.data.frame(sapply(FeatureList_final, `[[`, k))
})
final_cNF   <- class_scores_final[[1]]
final_MPNST <- class_scores_final[[2]]
final_NF    <- class_scores_final[[3]]
final_pNF   <- class_scores_final[[4]]


#Calculate median scores for each
# Plot the distribution of importance scores of features.
# For one tumour class: compute each feature's median importance across the
# final-ensemble iterations, keep the top 10 features, save them to the
# global environment as `ordered_<class>`, and return a ridgeline plot.
ridgeplot_classes <- function(featurelist, dataframe, class){
  #Take row median importance across iterations
  dataframe$median <- rowMedians(as.matrix(dataframe[,]))
  #add feature-name column
  dataframe$Celltype <- rownames(featurelist[[1]])
  #Take the top 10 rows by median importance
  dataframe <- dataframe[order(-dataframe$median),]
  new_df <- dataframe[(1:10),]
  #make long df: one row per (feature, iteration)
  dataframe_long <- gather(new_df, iteration, All_scores, V1:V500, factor_key=TRUE)
  
  #save the ordered df into the global environment for downstream chunks
  var_name <- glue('ordered_{class}') # Construct the name
  # `envir` is the actual argument name of assign(); the previous `env=`
  # only worked through partial argument matching
  assign(var_name, new_df, envir = .GlobalEnv)
 
  #make ridgeplot
  theme_update(legend.text = element_text(size=8), 
      axis.text.x  = element_text(size=15),
      axis.text.y = element_text(size=10),
      text = element_text(size=15))
  ggplot(dataframe_long, aes(x=All_scores, y=Celltype, fill=Celltype)) + 
  geom_density_ridges(scale = 3, rel_min_height = 0.01, alpha= 0.5)  +
  theme(legend.position="none") +
  scale_color_manual(values=color_list[[1]]) +
  # BUG FIX: this version plots the top 10 features, not 40 as the old
  # title claimed
  labs(title= glue('{class}: Top 10 Features'),x="Importance Score", y = "Proportion of RFs") +
  xlim(-10,100)
  
}

# Ridgeline plots of the top-10 features for the final ensemble; the
# "final_" prefix keeps the saved `ordered_final_<class>` dataframes
# distinct from the first-ensemble `ordered_<class>` ones
ridgeplot_classes(FeatureList_final, final_cNF, "final_Cutaneous Neurofibroma")

ridgeplot_classes(FeatureList_final, final_MPNST, "final_MPNST")

ridgeplot_classes(FeatureList_final, final_NF, "final_Neurofibroma")

ridgeplot_classes(FeatureList_final, final_pNF, "final_Plexiform Neurofibroma")

0.3 Plot the gene loadings of the top 10 important LVs for each of the tumor types

So we selected this restricted feature set for our later downstream analyses. The plots below show the gene loadings of the top important LVs.

0.3.1 Cutaneous Neurofibroma

## Plot the genes in the top LVs

# For each of the first 10 unique latent variables in `list` (an
# ordered_* dataframe produced by ridgeplot_classes), plot the 30 genes
# with the highest loadings from `plot_data` (the gene-by-LV matrix).
plotGenes <- function(plot_data, list){
  
  ## Transform list to enable plotting
## strip residual backticks from the feature labels
list$LV <- list$Celltype
list$LV <- gsub("`", "", list$LV)
##Function to extract the first number from each LV label, which maps the
##label back to its V<n> column in plot_data
library(stringr)
numextract <- function(string){
  str_extract(string, "\\-*\\d+\\.*\\d*")
}
list$LatentVar <- numextract(list$LV)
list$LatentVar <- glue('V{list$LatentVar}')
  
  for (i in (unique(list$LatentVar)[1:10])) {
 
  # one-column frame of loadings for LV `i`, gene names as a column
  tidy <- plot_data %>%
    dplyr::select(i) %>% 
    tibble::rownames_to_column('genes')
  
  # NOTE(review): ylim(c(0,8)) drops any gene whose loading exceeds 8,
  # which produces the "Removed N rows" warnings seen below -- confirm
  # this clipping is intended
  p <- ggplot(tidy %>% top_n(30, get(i))) +
    aes(x=reorder(genes, -get(i)), y=get(i)) +
    geom_bar(stat="identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    ylim(c(0,8)) +
    xlab("Genes") +
    ylab("Loadings") +
    ggtitle(glue('Gene Loadings for interesting LV: {i}'))
  
  print(p)
  
  }

}

# Gene loadings for the top cutaneous-neurofibroma LVs
plotGenes(gene_by_lv, `ordered_final_Cutaneous Neurofibroma`)

## Warning: Removed 1 rows containing missing values (position_stack).

0.3.2 Neurofibroma

## Plot the genes in the top LVs for Neurofibroma

plotGenes(gene_by_lv, ordered_final_Neurofibroma)

0.3.3 Plexiform Neurofibroma

## Plot the genes in the top LVs for Plexiform Neurofibroma

plotGenes(gene_by_lv, `ordered_final_Plexiform Neurofibroma`)

## Warning: Removed 2 rows containing missing values (position_stack).

## Warning: Removed 1 rows containing missing values (position_stack).

0.3.4 MPNST

## Plot the genes in the top LVs for MPNST

plotGenes(gene_by_lv, ordered_final_MPNST)

## Warning: Removed 4 rows containing missing values (position_stack).

# Uncomment to save the first-ensemble results locally
#save(FeatureList, perf_new, file = "RF_LV_Ensemble.Rdata")
# Record package versions for reproducibility
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] glue_1.3.1                      gridExtra_2.3                  
##  [3] AnnotationDbi_1.46.1            IRanges_2.18.2                 
##  [5] S4Vectors_0.22.0                Biobase_2.44.0                 
##  [7] BiocGenerics_0.30.0             ggridges_0.5.1                 
##  [9] pheatmap_1.0.12                 AppliedPredictiveModeling_1.1-7
## [11] doParallel_1.0.15               iterators_1.0.12               
## [13] foreach_1.4.7                   caret_6.0-84                   
## [15] lattice_0.20-38                 e1071_1.7-2                    
## [17] randomForest_4.6-14             wesanderson_0.3.6              
## [19] RColorBrewer_1.1-2              colorspace_1.4-1               
## [21] DT_0.8                          forcats_0.4.0                  
## [23] stringr_1.4.0                   dplyr_0.8.0.1                  
## [25] purrr_0.3.2                     readr_1.3.1                    
## [27] tidyr_0.8.3                     tibble_2.1.1                   
## [29] ggplot2_3.1.1                   tidyverse_1.2.1                
## [31] BiocManager_1.30.4              synapserutils_0.1.6            
## [33] synapser_0.6.58                
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-139          bit64_0.9-7           lubridate_1.7.4      
##  [4] httr_1.4.0            tools_3.6.0           backports_1.1.4      
##  [7] R6_2.4.0              rpart_4.1-15          DBI_1.0.0            
## [10] lazyeval_0.2.2        nnet_7.3-12           withr_2.1.2          
## [13] tidyselect_0.2.5      bit_1.1-14            compiler_3.6.0       
## [16] cli_1.1.0             rvest_0.3.3           xml2_1.2.0           
## [19] labeling_0.3          scales_1.0.0          digest_0.6.18        
## [22] rmarkdown_1.12        pkgconfig_2.0.2       htmltools_0.3.6      
## [25] htmlwidgets_1.3       rlang_0.4.0           readxl_1.3.1         
## [28] RSQLite_2.1.2         rstudioapi_0.10       generics_0.0.2       
## [31] jsonlite_1.6          ModelMetrics_1.2.2    CORElearn_1.53.1     
## [34] magrittr_1.5          Matrix_1.2-17         Rcpp_1.0.1           
## [37] munsell_0.5.0         stringi_1.4.3         yaml_2.2.0           
## [40] MASS_7.3-51.1         plyr_1.8.4            recipes_0.1.6        
## [43] blob_1.2.0            grid_3.6.0            crayon_1.3.4         
## [46] haven_2.1.0           splines_3.6.0         hms_0.4.2            
## [49] zeallot_0.1.0         knitr_1.22            pillar_1.3.1         
## [52] reshape2_1.4.3        codetools_0.2-16      PythonEmbedInR_0.3.37
## [55] evaluate_0.13         data.table_1.12.2     modelr_0.1.4         
## [58] vctrs_0.2.0           cellranger_1.1.0      gtable_0.3.0         
## [61] assertthat_0.2.1      pack_0.1-1            xfun_0.6             
## [64] gower_0.2.1           prodlim_2018.04.18    broom_0.5.2          
## [67] class_7.3-15          survival_2.43-3       timeDate_3043.102    
## [70] memoise_1.1.0         ellipse_0.4.1         cluster_2.0.8        
## [73] lava_1.6.6            ipred_0.9-9